# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import math
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
# Importing Wine Data
# NOTE(review): hard-coded absolute user path — breaks on any other machine;
# consider a relative path or a config/CLI argument.
wine_data = pd.read_csv("/Users/vishruta/Downloads/winequality-red.csv",low_memory =False )
wine_data
# Per-column count of missing values (dataset is expected to be complete).
wine_data.isnull().sum()
## Quality plot
# Bar chart of how many wines fall in each integer quality score.
sns.catplot(x='quality', data = wine_data, kind='count')
## Correlation plot
plt.figure(figsize = (15,15))
cmap = sns.light_palette((210, 90, 60), input="husl")
#cmap= sns.color_palette("PuBuGn_d")
# Annotated heatmap of pairwise feature correlations.
sns.heatmap(wine_data.corr(), cmap= cmap, annot=True, square=True)
plt.title("Correlation Plot")
#Correlation with Quality with respect to attributes
# Bar chart of each attribute's correlation with the target 'quality'.
wine_data.corrwith(wine_data.quality).plot.bar(
figsize = (20, 10), title = "Correlation with quality", fontsize = 15,
rot = 45, grid = True)
# Converting Quality to 0 and 1: probe several candidate cut-offs and
# inspect the class balance each one would produce before choosing one.
plt.figure(figsize=(10, 20))
candidate_splits = [
    ('quality_new1', 'For >= 5.5', wine_data['quality'] >= 5.5),
    ('quality_new2', 'For > 5', wine_data['quality'] > 5),
    ('quality_new3', 'For >= 6', wine_data['quality'] >= 6),
    ('quality_new4', 'For > 6.5', wine_data['quality'] > 6.5),
]
# Materialize each boolean mask as a 0/1 indicator column.
for column, _, mask in candidate_splits:
    wine_data[column] = mask * 1
# Report the resulting class counts for every candidate threshold.
for column, label, _ in candidate_splits:
    print(label, '\n', wine_data[column].value_counts())
# The exploratory indicator columns are no longer needed — drop them all.
wine_data = wine_data.drop([column for column, _, _ in candidate_splits], axis=1)
wine_data
# Feature matrix and target for the regression task.
# NOTE(review): 'alcohol' is excluded here although the later classification
# feature list includes it — confirm this omission is intentional.
features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates']
target=['quality']
X = wine_data[features]
y = wine_data[target]
# Perform train test split
# 67/33 split, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=200)
# Sanity-check the resulting shapes.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# Ordinary least-squares baseline predicting the raw quality score.
linear_regression = LinearRegression()
linear_regression_fit = linear_regression.fit(X_train, y_train)
linear_regression_prediction = linear_regression.predict(X_test)
# Measuring Performance Metrices
# NOTE: LinearRegression.score() returns R^2 (coefficient of determination),
# so the values printed as "Accuracy" below are R^2, not accuracies.
linear_regression_accuracy_train = linear_regression.score(X_train,y_train)
linear_regression_accuracy_test = linear_regression.score(X_test,y_test)
print('Accuracy on Train dataset', linear_regression_accuracy_train)
print('Accuracy on Test dataset', linear_regression_accuracy_test)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, linear_regression_prediction)))
print('R^2 of Train dataset:', r2_score(y_train, linear_regression.predict(X_train)))
print('R^2 of Test dataset:', r2_score(y_test, linear_regression_prediction))
#### Regularization using Ridge and Lasso Regression
#### Ridge Regression ####
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
# List to maintain the different cross-validation scores
cross_val_scores_ridge = []
# List to maintain the different values of alpha
alpha = []
# Loop to compute the different values of cross-validation scores.
# Fix: cross_val_score clones and re-fits the estimator for every fold, so
# the redundant ridgeModel.fit(X_train, y_train) before it (dead work whose
# fitted model was discarded) has been removed.
for i in range(1, 9):
    ridgeModel = Ridge(alpha = i * 0.25)
    scores = cross_val_score(ridgeModel, X, y, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_ridge.append(avg_cross_val_score)
    alpha.append(i * 0.25)
# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
    print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i]))
# Performance Metrices Ridge using alpha as 0.25
from sklearn.linear_model import Ridge
# Building and fitting the Ridge Regression model
ridgeModelChosen = Ridge(alpha = 0.25)
ridgeModelChosen.fit(X_train, y_train)
# Evaluating the Ridge Regression model
# NOTE: .score() on a regressor is R^2, not classification accuracy.
print('Ridge Regression')
print('Accuracy on Train dataset:', ridgeModelChosen.score(X_train, y_train))
print('Accuracy on Test dataset:', ridgeModelChosen.score(X_test, y_test))
#### Lasso Regression ####
# List to maintain the cross-validation scores
cross_val_scores_lasso = []
# List to maintain the different values of Lambda (the Lasso alpha)
Lambda = []
# Loop to compute the cross-validation scores. cross_val_score clones and
# re-fits the model per fold, so the original's redundant pre-fit is dropped.
for i in range(1, 9):
    lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925)
    scores = cross_val_score(lassoModel, X, y, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_lasso.append(avg_cross_val_score)
    Lambda.append(i * 0.25)
# Loop to print the different values of cross-validation scores.
# Bug fix: iterate over Lambda (the list built above) — the original indexed
# the unrelated ridge `alpha` list, which only worked by coincidence because
# both hold the same values.
for i in range(0, len(Lambda)):
    print(str(Lambda[i])+' : '+str(cross_val_scores_lasso[i]))
# Building and fitting the Lasso Regression Model
lassoModelChosen = Lasso(alpha = 0.25)
lassoModelChosen.fit(X_train, y_train)
# Evaluating the Lasso Regression model
# NOTE: .score() here is R^2 on the continuous target, not accuracy.
print('Lasso Regression')
print('Accuracy on Train dataset:', lassoModelChosen.score(X_train, y_train))
print('Accuracy on Test dataset:', lassoModelChosen.score(X_test, y_test))
# Binarize quality for classification: 1 = good (quality >= 6), 0 = otherwise.
wine_data['quality_new'] = (wine_data['quality'] >= 6)*1
# Drop the raw score so it cannot leak into the classifiers.
wine_data1=wine_data.drop(['quality'], axis=1)
sns.catplot(x='quality_new', data = wine_data1, kind='count')
wine_data1.quality_new.value_counts()
# Classification features: the regression feature set plus 'alcohol'.
features_log = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']
target_classifier = ['quality_new']
X1 = wine_data1[features_log]
y1 = wine_data1[target_classifier]
# Perform train and test split
# 67/33 split with a fixed seed (different seed from the regression split).
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33, random_state=324)
###### Logistic Regression ######
# Fit on train set
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression()
# Fix: y1_train is a one-column DataFrame; ravel it into the 1-D array
# sklearn expects (silences DataConversionWarning; fitted model unchanged).
logistic_regression.fit(X1_train, y1_train.values.ravel())
logistic_regression_prediction = logistic_regression.predict(X1_test)
from sklearn.metrics import confusion_matrix
# Accuracy on both splits plus the test-set confusion matrix.
logistic_regression_accuracy_test = accuracy_score(y1_test,logistic_regression_prediction)
logistic_regression_accuracy_train = accuracy_score(y1_train,logistic_regression.predict(X1_train))
logistic_regression_cnf_mat = confusion_matrix(y1_test, logistic_regression_prediction)
print('Accuracy of Train Dataset:',logistic_regression_accuracy_train)
print('Accuracy of Test Dataset:',logistic_regression_accuracy_test)
print("Precision:",metrics.precision_score(y1_test, logistic_regression_prediction))
print("Recall:",metrics.recall_score(y1_test,logistic_regression_prediction))
print('Confusion matrix:', '\n',logistic_regression_cnf_mat)
# Classification Report
from sklearn.metrics import classification_report
logistic_regression_cls_rep = classification_report(y1_test, logistic_regression_prediction)
print('Classification Report of Logistic Model:', '\n',logistic_regression_cls_rep)
# Confusion-matrix heatmap and ROC curve for the logistic model.
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(logistic_regression_cnf_mat), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
# Receiver Operating Curve
# Column 1 of predict_proba = probability of the positive class
# ([::,1] is equivalent to [:, 1]).
y1_pred_proba1 = logistic_regression.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test, y1_pred_proba1)
auc = metrics.roc_auc_score(y1_test, y1_pred_proba1)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Logistic Model")
plt.show()
# Importing libraries for Ridge, Lasso
from statistics import mean
# List to maintain the different cross-validation scores
cross_val_scores_ridge = []
# List to maintain the different values of alpha
alpha = []
# Loop to compute the different values of cross-validation scores.
# Fix: cross_val_score clones and re-fits the estimator for every fold, so
# the redundant ridgeModel.fit(X1_train, y1_train) before it is removed.
for i in range(1, 9):
    ridgeModel = Ridge(alpha = i * 0.25)
    scores = cross_val_score(ridgeModel, X1, y1, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_ridge.append(avg_cross_val_score)
    alpha.append(i * 0.25)
# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
    print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i]))
# Performance Metrices Ridge using alpha as 1.5
from sklearn.linear_model import Ridge
# Building and fitting the Ridge Regression model
ridgeModelChosen = Ridge(alpha = 1.5)
ridgeModelChosen.fit(X1_train, y1_train)
# Evaluating the Ridge Regression model
# NOTE: Ridge is a regressor, so .score() is R^2 on the 0/1 target — the
# "Accuracy" labels below are misnomers.
print('Accuracy on Train dataset:', ridgeModelChosen.score(X1_train, y1_train))
print('Accuracy on Test dataset:', ridgeModelChosen.score(X1_test, y1_test))
# Lasso Regression
# List to maintain the cross-validation scores
cross_val_scores_lasso = []
# List to maintain the different values of Lambda
Lambda = []
# Loop to compute the cross-validation scores (cross_val_score re-fits the
# estimator per fold itself, so the original's redundant pre-fit is dropped).
for i in range(1, 9):
    lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925)
    scores = cross_val_score(lassoModel, X1, y1, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_lasso.append(avg_cross_val_score)
    Lambda.append(i * 0.25)
# Loop to print the different values of cross-validation scores.
# Bug fix: iterate over Lambda — the original indexed the ridge `alpha`
# list, which only printed the right values because both lists coincide.
for i in range(0, len(Lambda)):
    print(str(Lambda[i])+' : '+str(cross_val_scores_lasso[i]))
# Building and fitting the Lasso Regression Model
lassoModelChosen = Lasso(alpha = 0.25)
lassoModelChosen.fit(X1_train, y1_train)
# Evaluating the Lasso Regression model
# NOTE: .score() here is R^2 on the binary target, not accuracy.
print('Accuracy on Train dataset:', lassoModelChosen.score(X1_train, y1_train))
print('Accuracy on Test dataset:', lassoModelChosen.score(X1_test, y1_test))
###### Decision Tree #######
import time
from datetime import date
import warnings
# NOTE(review): globally silencing ALL warnings also hides convergence and
# data-conversion issues from every model below — consider narrowing this.
warnings.filterwarnings('ignore')
# Wall-clock timing of the fit.
start_time = time.time()
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
# Cap the tree at 20 leaves to limit overfitting; fixed seed for repeatability.
decision_tree = DecisionTreeClassifier(max_leaf_nodes=20, random_state=0)
decision_tree.fit(X1_train,y1_train)
today = date.today()
print("Run Time: %s seconds" % (time.time() - start_time))
decision_tree_predict = decision_tree.predict(X1_test)
decision_tree_acc_score_train = accuracy_score(y1_train, decision_tree.predict(X1_train))
decision_tree_acc_score_test = accuracy_score(y1_test, decision_tree_predict)
decision_tree_recall = metrics.recall_score(y1_test, decision_tree_predict)
# Calculate Confusion Matrix
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
decision_tree_conf_matrix = confusion_matrix(y1_test,decision_tree_predict)
print('confusion matrix: ','\n',decision_tree_conf_matrix)
from sklearn import metrics
print("Recall:",metrics.recall_score(y1_test, decision_tree_predict))
print("Accuracy on Training set:", decision_tree_acc_score_train)
print("Accuracy on Test set:", decision_tree_acc_score_test)
from sklearn.tree import export_graphviz
# Bug fix: sklearn.externals.six was removed from scikit-learn (>=0.23);
# the standard-library io.StringIO is the drop-in replacement.
from io import StringIO
from IPython.display import Image
import pydotplus
dt_feature_names = list(X1.columns)
dot_data = StringIO()
# Render the fitted decision tree to DOT text, then to an inline PNG.
export_graphviz(decision_tree, out_file=dot_data,
                filled=True, rounded=False,
                special_characters=True, feature_names=dt_feature_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Confusion-matrix heatmap, ROC curve and report for the decision tree.
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(decision_tree_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Decision Tree', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
# Receiver Operating Curve
# Positive-class probabilities ([::,1] is equivalent to [:, 1]).
y2_pred_proba2 = decision_tree.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test, y2_pred_proba2)
auc = metrics.roc_auc_score(y1_test, y2_pred_proba2)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Decision Tree Model")
plt.show()
# Classification Report
from sklearn.metrics import classification_report
decision_tree_cls_rep = classification_report(y1_test, decision_tree_predict)
print('Classification Report of Decision Tree:', '\n',decision_tree_cls_rep)
### Optimization using GridSearch
# NOTE(review): decomposition, datasets, Pipeline, StandardScaler and
# cross_val_predict are imported but never used in this section.
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
# Search over split criterion and depth; ROC-AUC as the 5-fold CV metric.
tuned_parameters = {'criterion' : ['gini', 'entropy'],'max_depth':[4,6,8,12],'random_state':[14]}
# random_state ensures repeatable results
dt_clf = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='roc_auc')
dt_clf.fit(X1_train, y1_train)
print('The best model is: ', dt_clf.best_params_)
print('This model produces a mean cross-validated score (auc) of', dt_clf.best_score_)
from sklearn.metrics import precision_score, accuracy_score
print("GridSearch Optimization")
# GridSearchCV predicts with the refitted best estimator.
y1_true, y1_pred2 = y1_test, dt_clf.predict(X1_test)
decision_tree_opt3_accuracy_test = accuracy_score(y1_true, y1_pred2)
decision_tree_opt3_accuracy_train = accuracy_score(y1_train , dt_clf.predict(X1_train))
decision_tree_opt3_recall = metrics.recall_score(y1_true, y1_pred2)
decision_tree_opt3_conf_matrix = confusion_matrix(y1_true, y1_pred2)
print('Precision on the evaluation set: ', precision_score(y1_true, y1_pred2))
print('Accuracy on the evaluation set: ', accuracy_score(y1_true, y1_pred2))
print("Recall:",metrics.recall_score(y1_true, y1_pred2))
print('Confusion Matrix: ','\n',confusion_matrix(y1_true, y1_pred2))
# Classification Report
from sklearn.metrics import classification_report
decision_tree_opt3_cls_rep = classification_report(y1_test, dt_clf.predict(X1_test))
print('Classification Report of Decision Tree (GridSearch):', '\n',decision_tree_opt3_cls_rep)
# Confusion-matrix heatmap and ROC curve for the GridSearch-tuned tree.
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(decision_tree_opt3_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Decision Tree GridSearch', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
# Receiver Operating Curve
# Bug fix: score the GridSearch-tuned model (dt_clf, which delegates to its
# best_estimator_), not the original un-tuned decision_tree — otherwise the
# plotted curve does not match the "GridSearch" title.
y5_pred_proba5 = dt_clf.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test, y5_pred_proba5)
auc = metrics.roc_auc_score(y1_test, y5_pred_proba5)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Decision Tree Model GridSearch")
plt.show()
###### Random Forest #########
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): max_depth=2 is a very shallow baseline forest; the tuned
# versions below search deeper trees.
random_forest = RandomForestClassifier(max_depth = 2, random_state = 0)
random_forest.fit(X1_train,y1_train)
today = date.today()
print("Run Time: %s seconds" % (time.time() - start_time))
random_forest_predict = random_forest.predict(X1_test)
random_forest_acc_score_train = accuracy_score(y1_train, random_forest.predict(X1_train))
random_forest_acc_score_test = accuracy_score(y1_test, random_forest_predict)
random_forest_acc_score_recall = metrics.recall_score(y1_test, random_forest_predict)
# Calculate Confusion Matrix
random_forest_conf_matrix = confusion_matrix(y1_test,random_forest_predict)
print('confusion matrix: ','\n',random_forest_conf_matrix)
from sklearn import metrics
print("Recall:",random_forest_acc_score_recall)
print("Accuracy on Training set:", random_forest_acc_score_train)
print("Accuracy on Test set:", random_forest_acc_score_test)
# Confusion-matrix heatmap for the baseline forest.
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(random_forest_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Random Forest ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
# Receiver Operating Curve
# Positive-class probabilities ([::,1] is equivalent to [:, 1]).
y3_pred_proba3 = random_forest.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test, y3_pred_proba3)
auc = metrics.roc_auc_score(y1_test, y3_pred_proba3)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Random Forest Model")
plt.show()
# Classification Report
from sklearn.metrics import classification_report
random_forest_cls_rep = classification_report(y1_test, random_forest_predict)
print('Classification Report of Random Forest:', '\n',random_forest_cls_rep)
# Optimization using GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Grid over feature fraction, depth and leaf size; ROC-AUC, 5-fold CV.
tuned_parameters = {'max_features': [0.5,0.6,0.7,0.8,0.9,1.0],
'max_depth': [2,3,4,5,6,7],'min_samples_leaf':[1,10,100],'random_state':[14]}
# random_state ensures repeatable results
rf_clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='roc_auc')
rf_clf.fit(X1_train, y1_train)
print('The best model is: ', rf_clf.best_params_)
print('This model produces a mean cross-validated score (auc) of', rf_clf.best_score_)
from sklearn.metrics import precision_score, accuracy_score
print("GridSearch Optimization")
# Evaluate the refitted best estimator on the held-out test set.
y3_true1, y3_pred4 = y1_test, rf_clf.predict(X1_test)
random_forest_opt1_accuracy_test = accuracy_score(y3_true1, y3_pred4)
random_forest_opt1_accuracy_train = accuracy_score(y1_train, rf_clf.predict(X1_train))
random_forest_opt1_recall = metrics.recall_score(y3_true1, y3_pred4)
print('Precision on the evaluation set: ', precision_score(y3_true1, y3_pred4))
print('Accuracy on the evaluation set: ', accuracy_score(y3_true1, y3_pred4))
print("Recall:",random_forest_opt1_recall)
print('Confusion Matrix: ','\n',confusion_matrix(y3_true1, y3_pred4))
# Classification Report
from sklearn.metrics import classification_report
random_forest_opt1_cls_rep = classification_report(y1_test, rf_clf.predict(X1_test))
print('Classification Report of Random Forest(GridSearch):', '\n',random_forest_opt1_cls_rep)
# Confusion-matrix heatmap and ROC curve for the GridSearch-tuned forest.
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(confusion_matrix(y3_true1, y3_pred4)), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Random Forest GridSearch ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
# Receiver Operating Curve
# Bug fix: score the GridSearch-tuned model (rf_clf, which delegates to its
# best_estimator_), not the un-tuned baseline random_forest — otherwise the
# curve does not match the "GridSearch" title.
y6_pred_proba6 = rf_clf.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test, y6_pred_proba6)
auc = metrics.roc_auc_score(y1_test, y6_pred_proba6)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Random Forest using GridSearch")
plt.show()
# Optimization using Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn import ensemble
# Randomized search space: estimator count, criterion, depth, features, leaf size.
param_dist={'n_estimators':[100,200,300,400,500,600],'criterion':['gini','entropy'],'max_depth':randint(1,15),'max_features':randint(1,9),'min_samples_leaf':randint(1,9)}
rf1_clf=ensemble.RandomForestClassifier()
rf1_clf_cv=RandomizedSearchCV(rf1_clf,param_distributions=param_dist,cv=5)
rf1_clf_cv.fit(X1_train,y1_train)
print("Tuned Random Forest Parameters: {}".format(rf1_clf_cv.best_params_))
print("Best score is {}".format(rf1_clf_cv.best_score_))
# Putting these hyperparameters into our model
# NOTE(review): these values are hard-coded, presumably from an earlier
# search run — they will NOT track rf1_clf_cv.best_params_ from the search
# above; consider using rf1_clf_cv.best_estimator_ instead. Confirm intent.
rf1_clf=ensemble.RandomForestClassifier(criterion='entropy',max_depth=10,max_features=6,min_samples_leaf=1,n_estimators=400)
rf1_clf.fit(X1_train,y1_train)
print("RandomSearch optimization")
today = date.today()
# NOTE(review): start_time was set back in the Random Forest section, so
# this "Run Time" includes the GridSearch and RandomizedSearch runs too.
print("Run Time: %s seconds" % (time.time() - start_time))
rf1_clf1 = rf1_clf.predict(X1_test)
random_forest_opt2_accuracy = accuracy_score(y1_test, rf1_clf1)
rf1_clf1_conf_matrix = confusion_matrix(y1_test,rf1_clf1)
print('confusion matrix: ','\n',rf1_clf1_conf_matrix)
print("Recall:",metrics.recall_score(y1_test,rf1_clf1))
print("Accuracy on Training set:", accuracy_score(y1_train, rf1_clf.predict(X1_train)))
print("Accuracy on Test set:", random_forest_opt2_accuracy)
# Classification Report
from sklearn.metrics import classification_report
random_forest_opt2_cls_rep = classification_report(y1_test, rf1_clf.predict(X1_test))
print('Classification Report of Random Forest(Random Search):', '\n',random_forest_opt2_cls_rep)
# Confusion-matrix heatmap for the RandomSearch-tuned forest.
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(rf1_clf1_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Random Forest Random Search ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
from sklearn.tree import export_graphviz
# Bug fix: sklearn.externals.six was removed from scikit-learn (>=0.23);
# the standard-library io.StringIO is the drop-in replacement.
from io import StringIO
from IPython.display import Image
import pydotplus
# Visualize a single tree (the 6th estimator) from the tuned forest.
estimator = rf1_clf.estimators_[5]
dt_feature_names = list(X1.columns)
export_graphviz(estimator,
                out_file='tree.dot',
                feature_names = dt_feature_names,
                rounded = True, proportion = False,
                precision = 2, filled = True)
# Render the DOT file to PNG with the system graphviz binary.
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')